#Importing the required libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.4 ✔ dplyr 1.0.7
## ✔ tidyr 1.1.4 ✔ stringr 1.4.0
## ✔ readr 2.0.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.1.2
library(RSpectra)
## Warning: package 'RSpectra' was built under R version 4.1.2
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.2
## Loading required package: rpart
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(tictoc)
library(nnet) #for multinomial logistic regression
## Warning: package 'nnet' was built under R version 4.1.2
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Loading the Dataset
# NOTE(review): hard-coded absolute path — this only runs on the author's
# machine. Extracted into a named variable so it is easy to swap for a
# relative path (e.g. "spotify.csv") or a command-line argument.
spotify_csv_path <- "/Users/aishwaryasaibewar/Documents/SeattleUniversity-MSDS/Courses/SU Course Work/SPRING_2023/Statistical Machine Learning 2/Homework/Homework4/spotify.csv"
df <- read.csv(spotify_csv_path, header = TRUE, sep = ",")
# Inspect the distinct genres present in the raw data
df$genre %>% unique()
## [1] "Dark Trap" "Underground Rap" "Trap Metal" "Emo"
## [5] "Rap" "RnB" "Pop" "Hiphop"
## [9] "techhouse" "techno" "trance" "psytrance"
## [13] "trap" "dnb" "hardstyle"
# Inspect the column names of the dataframe
df %>% colnames()
## [1] "danceability" "energy" "key" "loudness"
## [5] "speechiness" "acousticness" "instrumentalness" "liveness"
## [9] "valence" "tempo" "duration_ms" "genre"
## [13] "song_name"
# Some songs appear under multiple genres in the original dataset, so only
# the first occurrence of each song name is kept for the analysis.
df <- distinct(df, song_name, .keep_all = TRUE)
# Count and list the unique genres that remain after de-duplication
length(unique(df$genre))
## [1] 8
unique(df$genre)
## [1] "Dark Trap" "Underground Rap" "Trap Metal" "Emo"
## [5] "Rap" "RnB" "Pop" "Hiphop"
# Keep three genres so that well-separated clusters can be observed
Spotify <- filter(df, genre %in% c("Rap","Dark Trap","Emo"))
# Confirm which genres remain
unique(Spotify$genre)
## [1] "Dark Trap" "Emo" "Rap"
# Draw an equal-sized random sample of 450 songs from each genre.
# First partition the data by genre; splitting uses no randomness,
# so the seed is set just before sampling.
genre_groups <- split(Spotify, Spotify$genre)
set.seed(6)
category_samples <- lapply(genre_groups, function(genre_df) {
  sample_n(genre_df, size = 450, replace = FALSE)
})
# Stack the per-genre samples back into a single dataframe
combined_data <- bind_rows(category_samples)
# Sanity check: 3 genres x 450 songs = 1350 unique song names
length(unique(combined_data$song_name))
## [1] 1350
# Use the song names as row names so they label observations downstream
rownames(combined_data) <- combined_data$song_name
# Drop the labels (genre, song_name): the unsupervised methods below must
# see only the unlabelled audio features.
spotify_df <- combined_data %>% select(-c(genre, song_name))
head(spotify_df)
# Principal component analysis on the standardized features
pr.out <- prcomp(spotify_df, scale=TRUE)
# Proportion of variance explained (PVE) by each principal component
pve <- data.frame(var = pr.out$sdev^2 / sum(pr.out$sdev^2))
pve$id <- as.integer(row.names(pve))
# Scree plots: the left panel shows the PVE of each individual component;
# the right panel shows the cumulative PVE as components are added.
p1 <- ggplot(pve, aes(x = id, y = var)) +
  geom_point() +
  geom_line() +
  labs(x = 'Principal component r',
       y = 'Proportion variance explained',
       title = 'Each')
p2 <- ggplot(pve, aes(x = id, y = cumsum(var))) +
  geom_point() +
  geom_line() +
  labs(x = 'Principal Components 1:r',
       y = '',
       title = 'Cumulative Sum')
grid.arrange(p1, p2, ncol = 2)
Principal component analysis was performed for dimensionality reduction of the dataset and to determine the principal components that capture most of the variance in the data. A scree plot, with the number of principal components on the x-axis and the proportion of variance explained on the y-axis, was created, as depicted in Figure 2. It can be observed that 8 principal components were required to capture approximately 90% of the variance in the dataset. Therefore, the dataset of 11 audio features was reduced to a lower-dimensional representation consisting of 8 principal components. This low-dimensional data was then used for developing the machine learning models.
# Proportion of variance explained by each principal component.
# FIX: use `<-` for assignment and avoid the name `var`, which masks
# the base function stats::var().
pve_vec <- pr.out$sdev^2 / sum(pr.out$sdev^2)
pve_vec
## [1] 0.22198474 0.17307526 0.09943428 0.09221496 0.08833936 0.08377145
## [7] 0.06845697 0.06453782 0.05210761 0.03791024 0.01816730
For the first principal component, energy, loudness, and acousticness are the most heavily weighted variables.
# Rank variables by the magnitude of their loadings on PC1
sort(abs(pr.out$rotation[, 1]), decreasing = TRUE)
## energy loudness acousticness valence
## 0.57971880 0.54432103 0.41356430 0.24122344
## danceability liveness instrumentalness tempo
## 0.21391197 0.20333817 0.16101940 0.11247155
## duration_ms speechiness key
## 0.08569388 0.05902365 0.04367479
For the second principal component, danceability, valence, speechiness, and instrumentalness are the most important variables.
# Rank variables by the magnitude of their loadings on PC2
sort(abs(pr.out$rotation[, 2]), decreasing = TRUE)
## danceability speechiness valence instrumentalness
## 0.49921086 0.47730137 0.43324741 0.41337462
## duration_ms tempo loudness liveness
## 0.29420192 0.25357257 0.07574100 0.06968706
## energy acousticness key
## 0.04058611 0.03291557 0.01457916
# Score matrix: one column per principal component
principal_components <- pr.out$x
# Keep 8 components — together they capture ~90% of the variance
components_count <- 8
prin_components_final <- principal_components[, 1:components_count]
# Lower-dimensional dataframe of the selected components
low_dim_spotify_data <- data.frame(prin_components_final)
# Attach the genre label for colouring the plot.
# BUG FIX: genre is read in as character, so the original
# `levels(Genre) <- c(levels(Genre))` was a no-op (levels() on a character
# vector is NULL); convert to a factor explicitly instead.
Genre <- as.factor(combined_data$genre)
low_dim_spotify_data$Genre <- Genre
# Plot PC1 against PC2, the two components capturing the most variance
fig1 <- plot_ly(data = low_dim_spotify_data)
fig1 <- fig1 %>% add_markers(x = ~PC1, y = ~PC2, color = ~Genre,
                             text = ~paste(Genre), hoverinfo = 'text')
# BUG FIX: the legend title said 'Private' — a leftover from a different
# dataset; it should describe the colour mapping, i.e. the genre.
fig1 <- fig1 %>% layout(legend = list(title = list(text = 'Genre')))
fig1
The genres Emo, Rap, and Dark Trap are largely separable from each other in the low-dimensional space. In this plot, we can see that PCA has captured underlying patterns that distinguish the three genres from the available variables. As can be observed, all three genres still share some audio characteristics. "Emo" forms the most tightly clustered group, reflecting its comparatively distinct audio profile, while "Dark Trap" and "Rap" overlap more, showing that they have more comparable audio features. This overlap is what makes it hard for machine learning models to distinguish between the overlapping genres.
# K-means clustering on the low-dimensional PCA representation
set.seed(2023)
tic()
# 3 clusters (one per genre), 30 random starts to avoid poor local optima
kmeans_pca <- kmeans(select(low_dim_spotify_data, -c(Genre)), centers = 3, nstart = 30)
low_dim_spotify_data$clusters <- as.factor(kmeans_pca$cluster)
toc()
## 0.035 sec elapsed
# PC1 vs PC2 coloured by cluster assignment, shaped by the true genre
ggplot(low_dim_spotify_data, aes(x=PC1, y=PC2, color=clusters, shape= Genre)) + geom_point()
# Map cluster ids to genre names.
# NOTE(review): this mapping is hard-coded from visual inspection of one
# run — k-means cluster numbering is arbitrary, so a majority-vote mapping
# per cluster would be more robust; confirm before reusing with another seed.
low_dim_spotify_data$pcalabels <- ifelse(low_dim_spotify_data$clusters == 3, 'Dark Trap',
                                  ifelse(low_dim_spotify_data$clusters == 2, 'Emo',
                                         'Rap'))
# Flag songs whose cluster-derived label disagrees with the actual genre
low_dim_spotify_data$different <- as.factor(ifelse(low_dim_spotify_data$pcalabels == low_dim_spotify_data$Genre, 0, 1))
# Highlight disagreements.
# FIX: supply explicit alpha values for the two levels instead of mapping a
# discrete variable onto the default continuous alpha scale, which produced
# the "Using alpha for a discrete variable is not advised" warning.
ggplot(low_dim_spotify_data, aes(x = PC1, y = PC2, color = Genre, alpha = different)) +
  geom_point() +
  scale_alpha_manual(values = c("0" = 0.3, "1" = 1), name = "different")
# Fraction of songs whose mapped label disagrees with the true genre
sum(low_dim_spotify_data$different==1)/length(low_dim_spotify_data$different)
## [1] 0.7807407
Under the hard-coded cluster-to-genre mapping used above, roughly 78% of songs received a label that differs from their actual genre. Because k-means numbers its clusters arbitrarily, this figure mostly reflects a mismatch between the assumed cluster-number-to-genre assignment and this particular run, rather than the quality of the clusters themselves; the cluster structure in the PC1–PC2 plane still broadly follows the three genres. A majority-vote mapping of clusters to genres would give a fairer measure of agreement.
# Class balance check: the sampling gave exactly 450 songs per genre
table(low_dim_spotify_data$Genre)
##
## Dark Trap Emo Rap
## 450 450 450
# Total within-cluster sum of squares for the fitted k-means model
twss <- kmeans_pca$tot.withinss
cat("Total within-group sum of squares is ", twss)
## Total within-group sum of squares is 9553.883
# Hierarchical clustering on a random subsample
set.seed(2)
sample_size <- 100
# Randomly select observations
subset_data <- low_dim_spotify_data[sample(nrow(low_dim_spotify_data), sample_size), ]
# BUG FIX: keep only the numeric principal-component columns. The original
# dropped just `Genre`, so the factor/character columns added earlier
# (clusters, pcalabels, different) were passed to dist(), which coerced
# them and emitted "NAs introduced by coercion" warnings, distorting the
# distance matrix.
dendro_data <- subset_data %>% select(starts_with("PC"))
# Compute the distance matrix once and fit four linkage variants
pc_dists <- dist(dendro_data)
hc.complete <- hclust(pc_dists, method = "complete")
hc.average <- hclust(pc_dists, method = "average")
hc.single <- hclust(pc_dists, method = "single")
hc.ward <- hclust(pc_dists, method = "ward.D2")
# Plot the four dendrograms, one per linkage method
linkage_fits <- list("Complete Linkage" = hc.complete,
                     "Average Linkage"  = hc.average,
                     "Single Linkage"   = hc.single,
                     "Ward Linkage"     = hc.ward)
for (fit_title in names(linkage_fits)) {
  plot(linkage_fits[[fit_title]], hang = -1, main = fit_title,
       xlab = "", sub = "", cex = .4)
}
# Cut the complete-linkage dendrogram into 3 clusters
dendro_cluster1 <- as.factor(cutree(hc.complete, 3))
# Pair each song name with its assigned cluster
clustered_spotify <- data.frame(genre_name = row.names(subset_data), Cluster = dendro_cluster1)
# Print the member songs of each cluster
for (cluster_id in 1:3) {
  members <- clustered_spotify$genre_name[clustered_spotify$Cluster == cluster_id]
  cat("Cluster", cluster_id, ":\n")
  cat(paste(members, collapse = ", "), "\n\n")
}
## Cluster 1 :
## Banana Split (with YNW Melly feat. Lil Durk), Na Na Na (Na Na Na Na Na Na Na Na Na), Come As You Are, Hexed, iPHONE (with Nicki Minaj), Gone Bad, Zonder Jou (feat. Jairzinho), Make War, The Way She Feels, Communicate, Prove You Wrong, Boms Has Been Planted - Prod. JUNIOR FERRARI, Hemorrhage (In My Hands), Purple Emoji (feat. J. Cole), Never Sure, Starless, The Middle, R U Mine?, Interlude (Ora è la mia ora), Princess of Light, Dirty Pistola 2, King for a Day (feat. Kellin Quinn), Mountains, Suicide Doors (feat. Gunna), Drive, Side by Side, Better Than Me, Smoke Break, Het, When You See My Friends, Unity, Ready to Go (Get Me Out of My Mind), Cosmopolitan Blood Loss, ghost girl, Teenagers, Brick by Boring Brick, Dirt Off Your Shoulder, It's Not My Time, MARY JANE, Welcome to Paradise, Ghetto America (feat. Yo Gotti & Lil Durk), The Mixed Tape - 2015 Remastered, Sleep Paralysis, Jenny, Flavor Of The Weak, You Found Me, Bigger, Just Another Product, Gold Gated Villa (feat. C Dot Castro), big city blues, Strong, My Obsession, Pray For Me (with Kendrick Lamar), Fuck ALL the Pain, I Don't Like Who I Was Then, Bulletproof Love, Wanted You (feat. Lil Uzi Vert), arms, A Long Way Down, Hold Onto Me, Wait A Minute, beaten down geranium molle, Carcosa, Pose (feat. Megan Thee Stallion & Lil Uzi Vert), Blue in the Dark, That Should Be Me, Story of My Life, Jamie All Over, Splashin, Zobamambafoo (Remix) (feat. Lil Uzi Vert & Lil Yachty), Exaltation, "The Take Over, The Breaks Over", You Got Me, RAW (backwards) [feat. Zacari], How You Love Me Now
##
## Cluster 2 :
## Terry McGinnis, Tempest, Piece by Piece - Idol Version, MIA, XXXX, Cheshmhaye To, Holy Smokes, Teenage Mind, Black Panther, Pull Up (feat. NAV), Someone You Loved, God Damn, Painting Shadows, Lesbian (feat. Gunna & Young Thug), SORCERER, Sticks & Stones, The Fourth Drink Instinct, Grip, BlindHarper, I'm Not The Only One, Morrissey, Cherie, Crystals, Duck n Creep
##
## Cluster 3 :
## Mural
# Plot PC1 against PC2 coloured by the hierarchical cluster assignment.
# FIX: attach the cluster labels as an explicit column instead of letting
# aes() resolve `dendro_cluster1` from the global environment — relying on
# globals inside aes() is fragile and breaks if the plot data is subset.
dendro_plot_df <- dendro_data
dendro_plot_df$hc_cluster <- dendro_cluster1
ggplot(dendro_plot_df, aes(x = PC1, y = PC2, color = hc_cluster)) +
  scale_color_discrete(name = "Cluster") + geom_point()
Complete Linkage with a random sample of 100 songs looked more balanced in shape compared to other linkage methods for this data, therefore it is considered for further analysis. The dendrogram with complete linkage was cut by specifying the number of clusters as 3. As shown in Figure 4, well-separated clusters were observed by plotting the primary principal component against the secondary principal component. This groups the songs with similar audio features into one cluster. It can be observed that for the random sample, a greater number of songs are grouped under cluster 1 compared to other clusters.
# Cross-tabulate the hierarchical clusters against the true genres
Actual <- subset_data$Genre
table(dendro_cluster1, Actual)
## Actual
## dendro_cluster1 Dark Trap Emo Rap
## 1 19 38 18
## 2 16 5 3
## 3 0 0 1
# Dataset for the random forest: drop the clustering-derived columns
# (clusters, pcalabels, different) so only PCs and the label remain.
rf_data <- select(low_dim_spotify_data, -c(clusters, pcalabels, different))
# Classification problem: the response must be a factor
rf_data$Genre <- as.factor(rf_data$Genre)
# 70/30 train/test split
set.seed(5)
train <- sample(seq_len(nrow(rf_data)), nrow(rf_data) * 0.7)
rf.train <- rf_data[train, ]
rf.test <- rf_data[-train, ]
# Number of predictor columns.
# BUG FIX: the original used length(rf.train), which counts every column
# including the response `Genre` (9 instead of 8), inflating mtry.
npredictors <- ncol(rf.train) - 1
set.seed(5)
tic()
# Fit the random forest with mtry = floor(sqrt(p)), the conventional
# default for classification, and 100 trees.
rf_model <- randomForest(Genre ~ ., data = rf.train, mtry = floor(sqrt(npredictors)), importance = TRUE, ntree = 100)
toc()
## 0.15 sec elapsed
# Variable importance of each principal component in the forest
importance(rf_model)
## Dark Trap Emo Rap MeanDecreaseAccuracy MeanDecreaseGini
## PC1 11.556410 31.435981 11.121286 33.494539 120.07967
## PC2 13.255755 25.579387 33.483888 42.737446 149.55388
## PC3 7.240155 8.969038 13.869282 17.044264 68.92560
## PC4 5.906263 6.085758 8.015571 10.944466 59.29696
## PC5 11.508524 17.290965 7.058349 18.245293 83.38098
## PC6 4.190714 12.112115 1.109618 9.669738 55.16363
## PC7 2.544206 6.452140 1.056716 6.269244 39.71247
## PC8 4.061921 9.472633 7.031548 12.139988 52.11837
# Dot-chart of the importance measures
varImpPlot(rf_model)
# Predict genres on the held-out test set
genre.pred <- predict(rf_model, rf.test, type = "class")
Predicted <- genre.pred
Actual <- rf.test$Genre
# Confusion table: rows = true genre, columns = predicted genre
table(Actual, Predicted)
## Predicted
## Actual Dark Trap Emo Rap
## Dark Trap 105 19 29
## Emo 11 109 4
## Rap 15 10 104
# Test error rate and accuracy
errorrate <- mean(Actual != Predicted)
cat("Test error rate for genre classification using random forest is ", errorrate)
## Test error rate for genre classification using random forest is 0.2167488
accuracy <- mean(Actual == Predicted)
cat("Accuracy for genre classification using random forest is ", accuracy)
## Accuracy for genre classification using random forest is 0.7832512
# Build the confusion matrix object and render it as a heatmap
confusion_matrix <- confusionMatrix(Predicted, Actual)
cm_df <- as.data.frame(confusion_matrix$table)
ggplot(cm_df, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "lightblue", high = "steelblue")
The random forest model was trained and evaluated for multiclass classification to predict music genres. This model was developed by considering 100 trees, with the number of predictors tried at each split equal to the square root of the number of predictors. The model achieved an accuracy of about 78% on the test data. The confusion matrix for the predictions on the test data is shown below. The Rap, Emo, and Dark Trap genres were correctly predicted about 81%, 88%, and 69% of the time, respectively.
# Dataset for multinomial logistic regression: drop the clustering-derived
# columns (clusters, pcalabels, different).
logistic_data <- select(low_dim_spotify_data, -c(clusters, pcalabels, different))
# Classification problem: the response must be a factor
logistic_data$Genre <- as.factor(logistic_data$Genre)
# 70/30 train/test split (same seed as the random forest, so the same split)
set.seed(5)
train <- sample(seq_len(nrow(logistic_data)), nrow(logistic_data) * 0.7)
logistic.train <- logistic_data[train, ]
logistic.test <- logistic_data[-train, ]
set.seed(5)
tic()
# Fit a multinomial logistic regression of genre on the 8 principal
# components ("Dark Trap" is the reference level of the response).
multi_model <- multinom(Genre ~ ., data = logistic.train)
## # weights: 30 (18 variable)
## initial value 1037.090001
## iter 10 value 623.051344
## iter 20 value 537.236931
## iter 30 value 511.529893
## iter 30 value 511.529892
## iter 30 value 511.529892
## final value 511.529892
## converged
# Coefficients and standard errors for the two non-reference genres
summary(multi_model)
## Call:
## multinom(formula = Genre ~ ., data = logistic.train)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6
## Emo -0.5392527 -1.0912544 0.1359434 -0.3662849 0.9374202 -1.856535 1.0949008
## Rap -0.6499806 -0.3701302 -1.8290091 0.6789540 1.2774312 -1.234256 0.8865475
## PC7 PC8
## Emo -0.6230031 0.06619405
## Rap -0.1289989 -1.28017073
##
## Std. Errors:
## (Intercept) PC1 PC2 PC3 PC4 PC5 PC6
## Emo 0.1574254 0.09915927 0.1321721 0.1338781 0.1320180 0.1685973 0.1365585
## Rap 0.1926754 0.10353979 0.1998581 0.1331620 0.1386139 0.2169624 0.1558667
## PC7 PC8
## Emo 0.1535821 0.1613028
## Rap 0.1422117 0.2617964
##
## Residual Deviance: 1023.06
## AIC: 1059.06
toc()
## 0.04 sec elapsed
# Predict genres on the held-out test data.
# FIX: the original called predict() twice with identical arguments
# (once via `newdata =` and once positionally); one call suffices.
predictions <- predict(multi_model, logistic.test, type = "class")
Predicted <- predictions
Actual <- logistic.test$Genre
# Confusion table: rows = true genre, columns = predicted genre
table(Actual, Predicted)
## Predicted
## Actual Dark Trap Emo Rap
## Dark Trap 97 27 29
## Emo 9 107 8
## Rap 15 14 100
# Test error rate and accuracy
errorrate <- mean(Actual != Predicted)
cat("Test error rate for genre classification using logistic regression is ", errorrate)
## Test error rate for genre classification using logistic regression is 0.2512315
accuracy <- mean(Actual == Predicted)
cat("Accuracy for genre classification using logistic regression is ", accuracy)
## Accuracy for genre classification using logistic regression is 0.7487685
# Build the confusion matrix object and render it as a heatmap
confusion_matrix <- confusionMatrix(Predicted, Actual)
cm_df <- as.data.frame(confusion_matrix$table)
ggplot(cm_df, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "lightblue", high = "steelblue")
The low-dimensional data from PCA was used to train a multinomial logistic regression model to predict the genre of the songs. Figure 5 depicts this model's confusion matrix. On the test data, this model achieved an accuracy of about 75%. The Rap and Emo genres were correctly predicted about 78% and 86% of the time, respectively, while the rate for "Dark Trap" songs was only about 63%. It can be observed that the model performs weakest on the "Dark Trap" classification.